tmp$alpha <- alpha_est
setkey(tmp, person, session)
## Loop over sessions
out_list <- list()
i<-4
for(i in 2:length(levels(tmp$session))){
data_t_1 <- tmp[session == levels(tmp$session)[i]]
data_t_0 <- tmp[session == levels(tmp$session)[i-1]]
data_t_1$in_previous <- data_t_1$person %in% data_t_0$person
data_t_0$in_next <- data_t_0$person %in% data_t_1$person
mean_remain_men_t_0 <- mean(data_t_0[in_next == TRUE & gender == "Male"]$alpha)
mean_remain_men_t_1 <- mean(data_t_1[in_previous == TRUE & gender == "Male"]$alpha)
mean_remain_women_t_0 <- mean(data_t_0[in_next == TRUE & gender == "Female"]$alpha)
mean_remain_women_t_1 <- mean(data_t_1[in_previous == TRUE & gender == "Female"]$alpha)
mean_leavers_men_t_0 <- mean(data_t_0[in_next == FALSE & gender == "Male"]$alpha)
mean_leavers_women_t_0 <- mean(data_t_0[in_next == FALSE & gender == "Female"]$alpha)
mean_joiners_men_t_1 <- mean(data_t_1[in_previous == FALSE & gender == "Male"]$alpha)
mean_joiners_women_t_1 <- mean(data_t_1[in_previous == FALSE & gender == "Female"]$alpha)
frac_remainers_men_t_0 <- mean(data_t_0[gender == "Male"]$in_next)
frac_remainers_men_t_1 <- mean(data_t_1[gender == "Male"]$in_previous)
frac_remainers_women_t_0 <- mean(data_t_0[gender == "Female"]$in_next)
frac_remainers_women_t_1 <- mean(data_t_1[gender == "Female"]$in_previous)
## Remove NANs
if(is.nan(mean_remain_men_t_0)) mean_remain_men_t_0 <- 0
if(is.nan(mean_remain_men_t_1)) mean_remain_men_t_1 <- 0
if(is.nan(mean_remain_women_t_0)) mean_remain_women_t_0 <- 0
if(is.nan(mean_remain_women_t_1)) mean_remain_women_t_1 <- 0
if(is.nan(mean_leavers_men_t_0)) mean_leavers_men_t_0 <- 0
if(is.nan(mean_leavers_women_t_0)) mean_leavers_women_t_0 <- 0
if(is.nan(mean_joiners_men_t_1)) mean_joiners_men_t_1 <- 0
if(is.nan(mean_joiners_women_t_1)) mean_joiners_women_t_1 <- 0
if(is.nan(frac_remainers_men_t_0)) frac_remainers_men_t_0 <- 0
if(is.nan(frac_remainers_men_t_1)) frac_remainers_men_t_1 <- 0
if(is.nan(frac_remainers_women_t_0)) frac_remainers_women_t_0 <- 0
if(is.nan(frac_remainers_women_t_1)) frac_remainers_women_t_1 <- 0
mean_men_t_0 <- (frac_remainers_men_t_0 * mean_remain_men_t_0) +
((1 - frac_remainers_men_t_0) * mean_leavers_men_t_0)
mean_women_t_0 <- (frac_remainers_women_t_0 * mean_remain_women_t_0) +
((1 - frac_remainers_women_t_0) * mean_leavers_women_t_0)
mean_men_t_1 <- (frac_remainers_men_t_1 * mean_remain_men_t_1) +
((1 - frac_remainers_men_t_1) * mean_joiners_men_t_1)
mean_women_t_1 <- (frac_remainers_women_t_1 * mean_remain_women_t_1) +
((1 - frac_remainers_women_t_1) * mean_joiners_women_t_1)
socialization_effect_men <- (frac_remainers_men_t_1 * mean_remain_men_t_1) - (frac_remainers_men_t_0 * mean_remain_men_t_0)
socialization_effect_women <- (frac_remainers_women_t_1 * mean_remain_women_t_1) - (frac_remainers_women_t_0 * mean_remain_women_t_0)
replacement_effect_men <- ((1 - frac_remainers_men_t_1) * mean_joiners_men_t_1) - ((1 - frac_remainers_men_t_0) * mean_leavers_men_t_0)
replacement_effect_women <- ((1 - frac_remainers_women_t_1) * mean_joiners_women_t_1) - ((1 - frac_remainers_women_t_0) * mean_leavers_women_t_0)
socialisation_effect_diff <-  socialization_effect_women - socialization_effect_men
replacement_effect_diff <- replacement_effect_women - replacement_effect_men
out_chain[it,i-1,] <- c(mean_men_t_0, mean_men_t_1, mean_women_t_0, mean_women_t_1, socialization_effect_men, socialization_effect_women, socialisation_effect_diff, replacement_effect_men, replacement_effect_women, replacement_effect_diff)
}
}
return(out_chain)
}
out_chains <- lapply(1:length(vars), function(x) get_replacement_socialisation_effects_chain(x))
socialisation_effect_diffs_men <- do.call("rbind",lapply(out_chains,function(x) quantile(rowMeans(x[,,5]),c(0.025,.5,.975))))
replacement_effect_diffs_men <- do.call("rbind",lapply(out_chains,function(x) quantile(rowMeans(x[,,8]),c(0.025,.5,.975))))
socialisation_effect_diffs_women <- do.call("rbind",lapply(out_chains,function(x) quantile(rowMeans(x[,,6]),c(0.025,.5,.975))))
replacement_effect_diffs_women <- do.call("rbind",lapply(out_chains,function(x) quantile(rowMeans(x[,,9]),c(0.025,.5,.975))))
socialisation_effect_diffs_all <- do.call("rbind",lapply(out_chains,function(x) quantile(rowMeans(x[,,7]),c(0.025,.5,.975))))
replacement_effect_diffs_all <- do.call("rbind",lapply(out_chains,function(x) quantile(rowMeans(x[,,10]),c(0.025,.5,.975))))
xlims <- range(c(socialisation_effect_diffs_men, replacement_effect_diffs_men,
socialisation_effect_diffs_women, replacement_effect_diffs_women,
socialisation_effect_diffs_all, replacement_effect_diffs_all))
pdf("analysis/plots/model2_replacement_socialisation_decomposition_average_uncertainty.pdf", 15, 4.5)
shift <- .1
ylims <- range(c((length(vars):1)) - shift, (length(vars):1)+shift)
par(mfrow = c(1,3), mar = c(5,8,4,2)+0.1, cex = .85)
plot(x = socialisation_effect_diffs_men[,2], y = (length(vars):1)+shift, xlim = xlims, pch = 19, ylab = "", yaxt = "n", xlab = "Average change in male style over time", main = "Male MPs", ylim = ylims)
points(x = replacement_effect_diffs_men[,2], y = (length(vars):1)-shift, pch = 1)
abline(v = 0, lty = 3)
segments(x0 = socialisation_effect_diffs_men[,1], x1 = socialisation_effect_diffs_men[,3], y0 = (length(vars):1)+shift)
segments(x0 = replacement_effect_diffs_men[,1], x1 = replacement_effect_diffs_men[,3], y0 = (length(vars):1)-shift, lty =2)
axis(2, at = length(vars):1, labels = var_names, las = 2)
legend("topright", legend = c("Replacement", "Within-MP"), pch = c(1, 19), bty = "n" , cex = 1)
plot(x = socialisation_effect_diffs_women[,2], y = (length(vars):1)+shift, xlim = xlims, pch = 19, ylab = "", yaxt = "n", xlab = "Average change in female style over time", main = "Female MPs", ylim = ylims)
points(x = replacement_effect_diffs_women[,2], y = (length(vars):1)-shift, pch = 1)
abline(v = 0, lty = 3)
segments(x0 = socialisation_effect_diffs_women[,1], x1 = socialisation_effect_diffs_women[,3], y0 = (length(vars):1)+shift)
segments(x0 = replacement_effect_diffs_women[,1], x1 = replacement_effect_diffs_women[,3], y0 = (length(vars):1)-shift, lty =2)
axis(2, at = length(vars):1, labels = var_names, las = 2)
legend("topright", legend = c("Replacement", "Within-MP"), pch = c(1, 19), bty = "n" , cex = 1)
plot(x = socialisation_effect_diffs_all[,2], y = (length(vars):1)+shift, xlim = xlims, pch = 19, ylab = "", yaxt = "n", xlab = "Female effect - male effect", main = "Female MPs - Male MPs", ylim = ylims)
points(x = replacement_effect_diffs_all[,2], y = (length(vars):1)-shift, pch = 1)
abline(v = 0, lty = 3)
segments(x0 = socialisation_effect_diffs_all[,1], x1 = socialisation_effect_diffs_all[,3], y0 = (length(vars):1)+shift)
segments(x0 = replacement_effect_diffs_all[,1], x1 = replacement_effect_diffs_all[,3], y0 = (length(vars):1)-shift, lty =2)
axis(2, at = length(vars):1, labels = var_names, las = 2)
legend("topright", legend = c("Replacement", "Within-MP"), pch = c(1, 19), bty = "n" , cex = 1)
dev.off()
####  Model 1 and 2 - covariate analysis ####
load("working/stan_out/standata_model2.Rdata")
get_beta_effects <- function(v = "affect_std"){
load(paste0("working/stan_out/model2_control/",v,"_posterior.Rdata"))
params <- extract(posterior)
out <- data.frame(style = var_names[vars==v], var = metadata$labels_X_ind, t(apply(params$Beta,2,quantile, c(0.025, 0.5, 0.975))))
names(out) <- c("style", "var", "lo", "est", "hi")
return(out)
}
out <- do.call("rbind", lapply(vars, get_beta_effects))
out$var <- as.character(out$var)
out$var[out$var == "age"] <- "Age"
out$var[out$var == "frontbench"] <- "Frontbench"
out$var[out$var == "committee_chair"] <- "Com. Chair"
out$var[out$var == "Other"] <- "Other party"
out$var[out$var == "LibDem"] <- "Lib Dem"
out$var[out$var == "gov_mp"] <- "Government MP"
out$var[out$var == "ld_gov_mp"] <- "LD * Government MP"
out$var[out$var == "lab_gov_mp"] <- "Lab * Government MP"
out$var[out$var == "margin"] <- "Margin of Victory"
out$var[out$var == "business"] <- "Occ. (business)"
out$var[out$var == "political"] <- "Occ. (political)"
out$var[out$var == "professional"] <- "Occ. (professional)"
out$var[out$var == "manual"] <- "Occ. (manual)"
out$var[out$var == "has_degree"] <- "University"
plot_beta <- function(style = "Affect"){
tmp <- out[out$style == style,]
tmp$y <- length(tmp$var):1
plot(tmp$est, tmp$y,
xlim = range(c(tmp$lo, tmp$hi)),
pch = 19,
xlab = expression(beta),
ylab = "",
yaxt = "n",
main = style)
axis(2, at = tmp$y, labels = tmp$var, las = 2)
segments(x0 = tmp$lo, x1 = tmp$hi, y0 = tmp$y)
abline(v = 0 , lty = 3)
abline(h = tmp$y , lty = 3, col = alpha("black",.2))
}
pdf("analysis/plots/model2_covariate_effects.pdf",12,5)
par(mfrow = c(2,4), mar = c(5,8.5,2,2)+0.1)
for(style in unique(out$style)) plot_beta(style)
dev.off()
####  Model 2 - R-squared ####
load("working/stan_out/standata_model2.Rdata")
get_gender_r2_time <- function(outcome = "affect_std"){
load(paste0("working/stan_out/model2/",outcome,"_posterior.Rdata"))
params <- extract(posterior)
## R-squared
alpha <- params$alpha
mu_0 <- params$alpha_intercept
mu_1 <- mu_0 + params$alpha_gender_effect
fitted_alpha <- matrix(NA, nrow(alpha), ncol(alpha))
for(i in 1:nrow(fitted_alpha)) fitted_alpha[i,] <- ifelse(standata$Ri_gender == 0, mu_0[i,standata$Ri_term], mu_1[i,standata$Ri_term])
eps <- alpha - fitted_alpha
r_squared <- rep(NA, standata$N_terms)
for(y in 1:standata$N_terms) r_squared[y] <- 1- (mean(apply(eps[,standata$Ri_term==y],1,var)) / mean(apply(alpha[,standata$Ri_term==y],1,var)))
return(data.frame(term = 1:standata$N_terms, r_squared))
}
out_list <- lapply(vars, function(x) get_gender_r2_time(x))
out <- do.call("cbind", out_list)[,seq(2,dim(do.call("cbind", out_list))[2],2)]
names(out) <- vars
r_squared_time <- out
save(r_squared_time, file = "working/r_squared_time.Rdata")
####  Model 2 -  Debate effect analyses ####
speech_scores$R_debate_id <- as.numeric(as.factor(as.character(speech_scores$section_id)))
debate_titles <- speech_scores[,list(title = unique(parent), fact_std = mean(fact_std), debate_type = unique(debate_type)),by = R_debate_id]
levels(debate_titles$debate_type) <- c("Other", "QT", "PMQ", "Procedural", "Opp/Backbench", "Legislation")
setkey(debate_titles, R_debate_id)
plot_debate_effects <- function(outcome = "affect_std"){
load(paste0("working/stan_out/model2/",outcome,"_posterior.Rdata"))
params <- extract(posterior)
debate_titles$y_tmp <- apply(params$delta,2,mean)
tmp_mod <- lm(y_tmp ~ debate_type - 1, debate_titles)
coefs <- coef(summary(tmp_mod))
coefs <- data.frame(coefs)
coefs$hi <- coefs$Estimate + 1.96*coefs$Std..Error
coefs$lo <- coefs$Estimate - 1.96*coefs$Std..Error
plot(coefs[,1],1:nrow(coefs), xlim = range(c(coefs$hi, coefs$lo)), pch = 19,
main = var_names[vars == outcome], yaxt = "n", ylab = "", xlab = "Debate-type mean")
segments(x0 = coefs$lo, x1 = coefs$hi, y0 = 1:nrow(coefs))
axis(2, at = 1:nrow(coefs), gsub("debate_type","",rownames(coefs)), las = 2)
abline(h = 1:nrow(coefs), lty = 3, col = alpha("black", .2))
}
pdf("analysis/plots/model2_debate_effects.pdf", width = 10, height = 5)
par(mfrow = c(2,4), mar = c(5,7,4,2)+0.1)
for(i in 1:length(vars)) plot_debate_effects(vars[i])
dev.off()
##### #############################################
#####                                        ######
#####       Analyse topic model              ######
#####                                        ######
##### #############################################
rm(list=ls())
library(quanteda) # v.3.0.0
library(stm) # v.1.3.6
library(data.table) # v.1.13.6
library(scales)
library(sandwich)
# Load data
load("data/debates.Rdata")
load("working/debate_dfm_stm.Rdata")
# Cut 1992-1997 session & speaker
debates <- debates[parliamentary_term != "1992-1997"]
debates <- debates[is_speaker == FALSE]
# Collapse to speaker in debate level
text_by_mp_in_debate <- debates[,list(body = paste0(body, collapse =  " "),
gender = unique(gender),
year = unique(year),
parliamentary_term = unique(parliamentary_term)),
by = list(section_id,person_id)]
affect_list <- list()
posemo_list <- list()
negemo_list <- list()
fact_list <- list()
anecdote_list <- list()
aggression_list <- list()
complexity_list <- list()
repetition_list <- list()
i <- 0
topic_numbers <- seq(10, 80, 10)
for(k in topic_numbers){
i <- i + 1
load(paste0("working/stm_out/stm_out_",k,".Rdata"))
# Extract topics & merge
topic_proportions <- data.frame(stm_object$theta)
topic_labels <- paste0("topic_",1:ncol(topic_proportions))
names(topic_proportions) <- topic_labels
out <- data.frame(docvars(debate_dfm_stm)[rowSums(debate_dfm_stm) != 0,], topic_proportions)
parlimentary_term <- text_by_mp_in_debate[,list(parliamentary_term = unique(parliamentary_term)),
by = list(person_id, section_id)]
out <- merge(out, parlimentary_term, by = c("person_id","section_id"))
### Merge in style types -----
load("working/speech_scores.Rdata")
speech_scores <- speech_scores[parliamentary_term != "1992-1997"]
speech_scores <- speech_scores[is_speaker == FALSE]
speech_scores <- merge(out, speech_scores, by = c("person_id","section_id"))
model_formula <- paste0(paste0("gender.x * ", topic_labels[-1]), collapse = " + ")
# Models
affect <- lm(paste0("affect_std ~ ", model_formula), data = speech_scores)
posemo <- lm(paste0("posemo_std ~ ", model_formula), data = speech_scores)
negemo <- lm(paste0("negemo_std ~ ", model_formula), data = speech_scores)
fact <- lm(paste0("fact_std ~ ", model_formula), data = speech_scores)
anecdote <- lm(paste0("anecdote_std ~ ", model_formula), data = speech_scores)
aggression <- lm(paste0("aggression_std ~ ", model_formula), data = speech_scores)
complexity <- lm(paste0("complexity_std ~ ", model_formula), data = speech_scores)
repetition <- lm(paste0("repetition_std ~ ", model_formula), data = speech_scores)
topic_gender_gaps <- function(model = affect, ...){
model_coefs <- coef(model)
model_summary <- summary(model)
model_vcov <- vcov(model)
baseline <- model_coefs[2]
baseline_hi <- baseline + 1.96 * coef(model_summary)[2,2]
baseline_lo <- baseline - 1.96 * coef(model_summary)[2,2]
# Extract interaction terms
interaction_terms <- grep("\\:", names(model_coefs))
# Calculate marginal effects
marginal_effects <- baseline + model_coefs[interaction_terms]
# Calculate standard errors for sums of coefficients
marginal_effect_ses <- sapply(1:length(interaction_terms),function(x) sqrt(model_vcov[2,2] + model_vcov[interaction_terms[x],interaction_terms[x]] + (2*model_vcov[2,interaction_terms[x]])))
# Upper and lower confidence intervals
marginal_effects_hi <- marginal_effects + 1.96 * marginal_effect_ses
marginal_effects_lo <- marginal_effects - 1.96 * marginal_effect_ses
# Combine all into a data frame
out <- data.frame(topic_labels = topic_labels,
est = c(baseline, marginal_effects),
hi = c(baseline_hi, marginal_effects_hi),
lo = c(baseline_lo, marginal_effects_lo),
z = c(baseline/coef(model_summary)[2,2], marginal_effects/marginal_effect_ses),
row.names = NULL)
out <- out[order(out$z, decreasing = F),]
out$y <- 1:nrow(out)
return(out)
}
affect_diffs <- topic_gender_gaps(affect, main = "Affect")
negemo_diffs <- topic_gender_gaps(negemo, main = "Negative Emotion")
posemo_diffs <- topic_gender_gaps(posemo, main = "Positive Emotion")
fact_diffs <- topic_gender_gaps(fact, main = "Fact")
anecdote_diffs <- topic_gender_gaps(anecdote, main = "Human Narrative")
aggression_diffs <- topic_gender_gaps(aggression, main = "Aggression")
complexity_diffs <- topic_gender_gaps(complexity, main = "Complexity")
repetition_diffs <- topic_gender_gaps(repetition, main = "Repetition")
## Predict gender gap for each topic for each style
speech_scores <- data.table(speech_scores)
words_topic_month <- data.frame(speech_scores[,lapply(.SD, function(x) sum(x * n_words)),by = yearmon,.SDcols = paste0("topic_",1:length(topic_labels))])
words_topic_month <- reshape2::melt(words_topic_month, id.vars = c("yearmon"))
words_topic_month$affect_gap <- affect_diffs[match(words_topic_month$variable, repetition_diffs$topic_labels),]$z
words_topic_month$negemo_gap <- negemo_diffs[match(words_topic_month$variable, repetition_diffs$topic_labels),]$z
words_topic_month$posemo_gap <- posemo_diffs[match(words_topic_month$variable, repetition_diffs$topic_labels),]$z
words_topic_month$fact_gap <- fact_diffs[match(words_topic_month$variable, repetition_diffs$topic_labels),]$z
words_topic_month$anecdote_gap <- anecdote_diffs[match(words_topic_month$variable, repetition_diffs$topic_labels),]$z
words_topic_month$aggression_gap <- aggression_diffs[match(words_topic_month$variable, repetition_diffs$topic_labels),]$z
words_topic_month$complexity_gap <- complexity_diffs[match(words_topic_month$variable, repetition_diffs$topic_labels),]$z
words_topic_month$repetition_gap <- repetition_diffs[match(words_topic_month$variable, repetition_diffs$topic_labels),]$z
z <- rep(NA, length(topic_labels))
for(t in 1:length(topic_labels)) z[t] <- coef(summary(lm(value ~ yearmon, words_topic_month, subset = words_topic_month$variable == topic_labels[t])))[2,3]
affect_diffs$z_time <- z[match(affect_diffs$topic_labels, topic_labels)]
posemo_diffs$z_time <- z[match(posemo_diffs$topic_labels, topic_labels)]
negemo_diffs$z_time <- z[match(negemo_diffs$topic_labels, topic_labels)]
fact_diffs$z_time <- z[match(fact_diffs$topic_labels, topic_labels)]
anecdote_diffs$z_time <- z[match(anecdote_diffs$topic_labels, topic_labels)]
aggression_diffs$z_time <- z[match(aggression_diffs$topic_labels, topic_labels)]
complexity_diffs$z_time <- z[match(complexity_diffs$topic_labels, topic_labels)]
repetition_diffs$z_time <- z[match(repetition_diffs$topic_labels, topic_labels)]
var <- "affect"
plot_gap_vs_time <- function(var,...){
plot(get(paste0(var,"_diffs"))$z, get(paste0(var,"_diffs"))$z_time, bty = "n", pch = 19, ylab = "Effect of time on topic prevalence", xlab = "Gender gap on topic", ...)
abline(v = 0, lty = 3)
abline(h = 0, lty = 3)
model <- lm(z_time ~ z, data = get(paste0(var,"_diffs")))
abline(model,
col = ifelse(coef(summary(model))[2,4] < 0.05, "red", "gray"),
lty = ifelse(coef(summary(model))[2,4] < 0.05, 1, 2),
lwd = 2)
return(model)
}
pdf(paste0("analysis/plots/topic/gap_vs_time_",k,".pdf"),12,6)
par(mfrow = c(2,4))
affect_list[[i]] <- plot_gap_vs_time("affect", main = "Affect")
posemo_list[[i]] <- plot_gap_vs_time("posemo", main = "Positive Emotion")
negemo_list[[i]] <- plot_gap_vs_time("negemo", main = "Negative Emotion")
fact_list[[i]] <- plot_gap_vs_time("fact", main = "Fact")
anecdote_list[[i]] <- plot_gap_vs_time("anecdote", main = "Human Narrative")
aggression_list[[i]] <- plot_gap_vs_time("aggression", main = "Aggression")
complexity_list[[i]] <- plot_gap_vs_time("complexity", main = "Complexity")
repetition_list[[i]] <- plot_gap_vs_time("repetition", main = "Repetition")
dev.off()
}
plot_coef_list <- function(tmp_list = affect_list,...){
x <- lapply(tmp_list, function(x) {
tmp <- coef(summary(x))
est <- tmp[2,1]
se <- tmp[2,2]
hi <- est + 1.96 * se
lo <- est - 1.96 * se
sig <- ifelse(abs(est/se) >= 1.96, TRUE, FALSE)
data.frame(est,se,hi,lo, sig)
}
)
x <- do.call("rbind", x)
x$k <- topic_numbers
plot(x = x$k, y = x$est, pch = ifelse(x$sig,19,19),
xlab = "Number of topics",
col = ifelse(x$sig,"black",alpha("black", .2)),
ylab = "Estimate", bty = "n", cex = 1.2, ylim = c(-2, 2),...)
segments(x0 = x$k, y0 = x$lo, y1 = x$hi, col = ifelse(x$sig,"black",alpha("black", .2)), lwd = 2)
abline(h = 0, lty = 2)
}
pdf(paste0("analysis/plots/topic/gap_vs_time_all_topics.pdf"),12,6)
par(mfrow = c(2,4))
plot_coef_list(affect_list, main = "Affect")
plot_coef_list(posemo_list, main = "Positive Emotion")
plot_coef_list(negemo_list, main = "Negative Emotion")
plot_coef_list(fact_list, main = "Fact")
plot_coef_list(anecdote_list, main = "Human Narrative")
plot_coef_list(aggression_list, main = "Aggression")
plot_coef_list(complexity_list, main = "Complexity")
plot_coef_list(repetition_list, main = "Repetition")
dev.off()
rm(list = ls())
##################################
##                              ##
##         Diagnostics          ##
##                              ##
##################################
# Top sentences for each style
load("working/dictionaries_sentence.Rdata") # Sentence-level complexity scores
load("working/complexity_sentence.Rdata") # Sentence-level complexity scores
load("working/repetition_sentence.Rdata") # Sentence-level repetition scores
load("working/speech_scores.Rdata")
load("data/debates.Rdata") # Raw debate information
stdize <- function(x) (x - mean(x, na.rm = T))/sd(x, na.rm = T)
library(plm)
complexity_scores$complexity_std <- stdize(complexity_scores$complexity)
dictionary_scores$complexity_std <- complexity_scores$complexity_std
dictionary_scores$repetition_std <- repetition_scores$repetition_std
dictionary_scores <- merge(dictionary_scores, debates[,c("is_speaker","epobject_id")], by = "epobject_id")
dictionary_scores[,glove_affect_prop:=glove_affect/n_words,]
dictionary_scores[,glove_posemo_prop:=glove_posemo/n_words]
dictionary_scores[,glove_negemo_prop:=glove_negemo/n_words]
dictionary_scores[,glove_fact_prop:=glove_fact/n_words]
dictionary_scores[,glove_anecdote_prop:=glove_anecdote/n_words]
dictionary_scores[,glove_anecdote_names_only_prop:=glove_anecdote_names_only/n_words]
dictionary_scores[,glove_anecdote_seed_only_prop:=glove_anecdote_seed_only/n_words]
dictionary_scores[,glove_aggression_prop:=glove_aggression/n_words]
min_words <- 20
max_words <- 120
dictionary_scores_tmp <- dictionary_scores[n_words>=min_words & n_words <= max_words]
dictionary_scores_tmp <- dictionary_scores_tmp[is_speaker == F]
top_affect <- dictionary_scores_tmp[order(glove_affect_prop, decreasing = T)[1:150]]
top_posemo <- dictionary_scores_tmp[order(glove_posemo_prop, decreasing = T)[1:150]]
top_negemo <- dictionary_scores_tmp[order(glove_negemo_prop, decreasing = T)[1:150]]
top_fact <- dictionary_scores_tmp[order(glove_fact_prop, decreasing = T)[1:150]]
top_anecdote <- dictionary_scores_tmp[order(glove_anecdote_prop, decreasing = T)[1:150]]
top_aggression <- dictionary_scores_tmp[order(glove_aggression_prop, decreasing = T)[1:150]]
top_complexity <- dictionary_scores_tmp[order(complexity_std, decreasing = T)[1:150]]
top_repetition <- dictionary_scores_tmp[order(repetition_std, decreasing = T)[1:150]]
top_affect <- top_affect[!duplicated(top_affect$sent)]
top_posemo <- top_posemo[!duplicated(top_posemo$sent)]
top_negemo <- top_negemo[!duplicated(top_negemo$sent)]
top_fact <- top_fact[!duplicated(top_fact$sent)]
top_anecdote <- top_anecdote[!duplicated(top_anecdote$sent)]
top_aggression <- top_aggression[!duplicated(top_aggression$sent)]
top_complexity <- top_complexity[!duplicated(top_complexity$sent)]
top_repetition <- top_repetition[!duplicated(top_repetition$sent)]
save(top_affect, top_posemo, top_negemo, top_fact, top_anecdote, top_aggression, top_repetition, top_complexity, file = "working/top_sentences.Rdata")
##################################
##                              ##
##         Analysis             ##
##                              ##
##################################
## Debate type models
style_types <- c("affect_std","posemo_std", "negemo_std", "fact_std", "anecdote_std", "aggression_std", "complexity_std", "repetition_std")
style_type_labels <- c("Emotion", "Pos. Emotion", "Neg. Emotion", "Fact", "Anecdote", "Aggression", "Complexity", "Repetition")
covariate_formula <- "~ gender + party + age_years + attends_cabinet + attends_shadow_cabinet + is_gov_minister + is_opp_minister + is_committee_chair + years_in_parliament"
covariate_labels <- c("Intercept", "Female", "Party: Green", "Party: Labour",
"Party: Lib Dem", "Party: Other",
"Party: SNP", "Age (Years)", "Attends Cabinet", "Attends Shadow Cabinet",
"Gov Minister", "Shadow Minister", "Committee Chair",
"Years in Parliament")
est_conf_ints <- function(x = bivariate_model){
coefs <- coef(summary(x))
est <- coefs[,1]
se <- coefs[,2]
hi <- est + 1.96 * se
lo <- est - 1.96 * se
return(data.frame(est, hi, lo))
}
coef_list <- list()
all_list <- list()
questions_list <- list()
PMQs_list <- list()
procedure_list <- list()
legislation_list <- list()
opp_bbb_list <- list()
other_list <- list()
i<-0
for(style in style_types){
print(style)
i <- i+1
all_model <- lm(as.formula(paste0(style, "~ gender")),
data = speech_scores,
weights = speech_scores$n_words)
questions_model <- lm(as.formula(paste0(style, "~ gender")),
data = speech_scores[speech_scores$debate_type=="Questions"],
weights = speech_scores$n_words[speech_scores$debate_type=="Questions"])
PMQs_model <- lm(as.formula(paste0(style, "~ gender")),
data = speech_scores[speech_scores$debate_type=="PMQs"],
weights = speech_scores$n_words[speech_scores$debate_type=="PMQs"])
procedure_model <- lm(as.formula(paste0(style, "~ gender")),
data = speech_scores[speech_scores$debate_type=="Procedure"],
weights = speech_scores$n_words[speech_scores$debate_type=="Procedure"])
legislation_model <- lm(as.formula(paste0(style, "~ gender")),
data = speech_scores[speech_scores$debate_type=="Legislation"],
weights = speech_scores$n_words[speech_scores$debate_type=="Legislation"])
opp_bbb_model <- lm(as.formula(paste0(style, "~ gender")),
data = speech_scores[speech_scores$debate_type=="Opposition/Backbench"],
weights = speech_scores$n_words[speech_scores$debate_type=="Opposition/Backbench"])
other_model <- lm(as.formula(paste0(style, "~ gender")),
data = speech_scores[speech_scores$debate_type=="Other"],
weights = speech_scores$n_words[speech_scores$debate_type=="Other"])
out <- list()
out$all <- est_conf_ints(all_model)
out$questions <- est_conf_ints(questions_model)
out$PMQs <- est_conf_ints(PMQs_model)
out$procedure <- est_conf_ints(procedure_model)
out$legislation <- est_conf_ints(legislation_model)
out$opp_bbb <- est_conf_ints(opp_bbb_model)
out$other <- est_conf_ints(other_model)
coef_list[[i]] <- out
}
names(coef_list) <- style_types
coef_list_debate_models <- coef_list
save(coef_list_debate_models, file = "working/debate_model_out.Rdata")
## Output manually constructed dictionaries
anecdote_words <- read.csv("data/dictionaries/seed_words_anecdote.csv", stringsAsFactors = F)[,1]
sink("analysis/anecdote_words.txt")
cat(paste0(anecdote_words, collapse = "; "))
sink()
aggression_words <- read.csv("data/dictionaries/LH_aggression_seed.csv", stringsAsFactors = F)[,1]
sink("analysis/aggression_words.txt")
cat(paste0(aggression_words, collapse = "; "))
sink()
library(margins)
library(tidyverse)
style_types <- c("affect_std","posemo_std", "negemo_std", "fact_std", "anecdote_std", "aggression_std", "complexity_std", "repetition_std")
style_type_labels <- c("Affect", "Pos. Emotion", "Neg. Emotion", "Fact", "Human Narrative", "Aggression", "Complexity", "Repetition")
margins_out_list <- list()
i<-1
for(i in 1:length(style_types)){
print(style_types[i])
x <- speech_scores[,list(N = .N,
y = mean(get(style_types[i])),
gender = unique(gender)),
by = list(person_id, parliamentary_term)]
x[,y:=scale(y), by = parliamentary_term]
mod <- lm(N ~ y * parliamentary_term * gender, data.frame(x))
margins_out <- margins::margins(mod,
at = list(parliamentary_term = unique(x$parliamentary_term),
gender = c("Male", "Female")),
variables = "y")
margins_out <- margins_out %>% summary() %>% as_tibble() %>% filter(!is.na(AME)) %>% as.data.frame()
margins_out$var <- style_type_labels[i]
margins_out_list[[i]] <- margins_out
}
margins_out <- do.call("rbind", margins_out_list)
margins_out$var <- factor(margins_out$var, levels = c("Human Narrative", "Affect", "Pos. Emotion", "Neg. Emotion", "Fact", "Aggression", "Complexity", "Repetition"))
lim <- max(abs(c(margins_out$lower, margins_out$upper)))
lims <- c(-lim, lim)
p1 <- margins_out %>%
ggplot(aes(x = parliamentary_term, y = AME, ymin = lower, ymax = upper, col = gender, linetype = gender)) + geom_errorbar(width = .01, position = position_dodge(.2)) +
geom_point(position = position_dodge(.2)) + theme_bw() +
geom_hline(yintercept = 0, linetype = 2) + xlab("") +
ylab("Effect of average style use on speech count") +
scale_color_manual(values = c("black", "red")) +
theme(legend.title = element_blank(),
axis.text.x = element_text(angle = 90, vjust = 0, hjust=1)) +
facet_wrap(~var, nrow = 2, ncol = 4) + ylim(lims)
pdf("analysis/plots/style_use_speech_rate.pdf",12,8)
print(p1)
dev.off()
